---
title: "MA Analysis code"
output: html_document
date: "2024-01-02"
editor_options: 
  chunk_output_type: inline
---

#Loading in data
```{r loading excel file}
library(readxl)
coding <- read_excel("Combined coding dataset.xlsx")
summary(coding)
```



```{r making dataframe}
df <- as.data.frame(coding)
#Makes additional vars that are easier to type in
df$tstat <- df$`t statistic`
df$degf <- df$`Degrees of freedom`
df$fz <- df$`Fisher's Z`
df$varfz <- df$`Var Fisher's Z`
df$sefz <- df$`SE Fisher`
df$IDStudy = as.factor(df$IDStudy)
df$seprecision <- 1/df$sefz
df$obsno = 1:nrow(df)

#Used later on
z = qnorm((1 - 0.95)/2, lower.tail = FALSE)
```

#Summary statistics
```{r Distrubtions of tstats and FZs}
summary(df$tstat)
quantile(df$tstat, probs = seq(0, 1, 0.25), na.rm = TRUE, type=1)
summary(df$degf)
quantile(df$degf, probs = seq(0, 1, 0.25), na.rm = TRUE, type=1)
summary(df$fz)
quantile(df$fz, probs = seq(0, 1, 0.01), na.rm = TRUE, type=1)
```

```{r Characteristics of t-stats and FZs}
df$tstat1 = (df$tstat < -2)
df$tstat2 = (df$tstat >= -2 & df$tstat <= 2)
df$tstat3 = (df$tstat > 2)
summary(as.numeric(df$tstat1))
summary(as.numeric(df$tstat2))
summary(as.numeric(df$tstat3))
```

```{r Subset fisher's z characteristics - unweighted}
calculate_stats <- function(data) {
  mean_value <- mean(data)
  ci <- t.test(data)$conf.int
  stats <- c(
    Observations = length(data),
    Mean = mean_value,
    `95% CI Lower` = ci[1],
    `95% CI Upper` = ci[2]
  )
  return(stats)
}

combined_table <- data.frame()
#Total (have to add this in manually)
combined_table <- rbind(combined_table, calculate_stats(df$fz))
rownames(combined_table)[1] <- "All estimates"
#Subgroups
library(tibble)
for (i in c(12:13, 15:72)) {
  tempdf <- df[df[i] == 1, ]
  sum_stats <- calculate_stats(tempdf$fz)
  combined_table <- rbind(combined_table, sum_stats)
}
rownames(combined_table)[2:61] <- colnames(df[c(12:13, 15:72)])
colnames(combined_table) <- c("Observations", "Mean", "95% CI Lower", "95% CI Upper")

options(scipen=999)
combined_table
```

```{r Subset fisher's z characteristics - weighted}
#IF RUNNING FOR SECOND TIME (OR GET "'x' and 'w' must have the same length" ERROR, THEN RE-RUN ORIGINAL CELL GETTING DF)

library(dplyr)
studyests <- as.data.frame(df %>%
                          group_by(IDStudy) %>%
                          summarise(
                            numberests = length(fz)
                          ))
hist(studyests$numberests, breaks=80, main='Number of Estimates', freq = TRUE, xlab = "Estimates per study"); box()
df_t <- right_join(df, studyests[,c('IDStudy','numberests')], by=c('IDStudy'))




##Getting results
weighted_stats <- function(data) { #Did this slightly different to other method to get the CI using unweighted SE but weighted mean
  mean_value <- weighted.mean(data$fz, 1/data$numberests)
  z = qnorm((1 - 0.95)/2, lower.tail = FALSE)
  SEx <- sd(data$fz)/sqrt(length(data$fz))
  stats <- c(
    Observations = length(data$fz),
    Mean = mean_value,
    `95% CI Lower` = mean_value - (z * SEx),
    `95% CI Upper` = mean_value + (z * SEx)
  )
  return(stats)
}

weighted_table <- data.frame()
#Total
weighted_table <- rbind(weighted_table, weighted_stats(df_t))
rownames(weighted_table)[1] <- "All estimates"

#Subgroups
for (i in c(12:13, 15:72)) {
  tempdf <- df[df[i] == 1, ]
  tempstudyests <- as.data.frame(tempdf %>%
                          group_by(IDStudy) %>%
                          summarise(
                            numberests = length(fz)
                          ))
  tempdf <- right_join(tempdf, tempstudyests[,c('IDStudy','numberests')], by=c('IDStudy'))
  sum_stats <- weighted_stats(tempdf)
  weighted_table <- rbind(weighted_table, sum_stats)
}
rownames(weighted_table)[2:61] <- colnames(df_t[c(12:13, 15:72)])
colnames(weighted_table) <- c("Observations", "Mean", "95% CI Lower", "95% CI Upper")

weighted_table
```

#Histograms
```{r All data}
##Fisher's Z's:
fzsummary <- summary(df$fz)
hist(df$fz, breaks=80, main="Histogram of Fisher's Z's", freq = FALSE)
abline(v = fzsummary['Mean'], col = "red", lwd = 1)
##T-statistics:
tstatsummary <- summary(df$tstat)
hist(df$tstat, breaks=80, main="Histogram of T-Statistics", freq = FALSE)
abline(v = tstatsummary['Mean'], col = "red", lwd = 1)
```

```{r 1% outliers removed}
##Fisher's Z's:
fzsummary2 <- summary(df2$`Fisher's Z`)
hist(df2$`Fisher's Z`, breaks=80, main="Histogram of Fisher's Z's with outliers removed", freq = FALSE)
abline(v = fzsummary2['Mean'], col = "red", lwd = 1)
##T-statistics:
tstatsummary2 <- summary(df2$tstat)
hist(df2$tstat, breaks=80, main="Histogram of T-Statistics with outliers removed", freq = FALSE)
abline(v = tstatsummary2['Mean'], col = "red", lwd = 1)
```

```{r subset histograms}
##Developed vs developing:
devdf <- df[df$`developped countries only` == 1, ]
ndevdf <-  df[df$`developing countries only` == 1, ]
hist(devdf$fz, col=rgb(0,0,1,0.2), breaks=80, #xlim=c(0, 1),
     xlab="Fisher's Z", ylab='Frequency', main='Developped vs Developing Countries')
hist(ndevdf$fz, col=rgb(1,0,0,0.2), add=TRUE)
legend('topleft', c('Developped', 'Developing'),
       fill=c(rgb(0,0,1,0.2), rgb(1,0,0,0.2)))

##High skill vs low skill
hskilldf <- df[df$`high education skills People Only` == 1, ]
lskilldf <-  df[df$`Low skilled educated People Only` == 1, ]
hist(lskilldf$fz, col=rgb(1,0,0,1),breaks=80, #xlim=c(0, 1),
     xlab="Fisher's Z", ylab='Frequency', main='Low Skill vs High Skill Workers')
hist(hskilldf$fz, col=rgb(0,1,0,1),breaks=80, add=TRUE)
legend('topleft', c('Low Skill', 'High Skill'),
       fill=c(rgb(1,0,0,1), rgb(0,1,0,1)))

##Male vs Female
maledf <- df[df$Male == 1, ]
femaledf <-  df[df$Female == 1, ]
hist(maledf$fz, col=rgb(0,0,1,1),breaks=80, #xlim=c(0, 1),
     xlab="Fisher's Z", ylab='Frequency', main='Male vs Female Workers')
hist(femaledf$fz, col=rgb(0,1,0,1),breaks=40, add=TRUE)
legend('topleft', c('Male', 'Female'),
       fill=c(rgb(0,0,1,1), rgb(0,1,0,1)))

##Data level
library(dplyr)
levelhistdf <- df
levelhistdf <- levelhistdf %>%
  mutate(level = ifelse(`Individual level data` == 1, "Individual level",
                               ifelse(`Firm level data` == 1, "Firm level",
                                      ifelse(`Industry level data` == 1, "Industry level", 
                                             ifelse(`Regional level data` == 1, "Regional level", "Country level")))))
library(ggplot2)
ggplot(levelhistdf, aes(x = fz)) +
  geom_density(aes(color = level))

##IFR vs Non-IFR
ifrdf <- df[df$NIFR == 0, ]
nifrdf <-  df[df$NIFR == 1, ]
hist(ifrdf$fz, col=rgb(1,0,0,1),breaks=80, #xlim=c(0, 1),
     xlab="Fisher's Z", ylab='Frequency', main='IFR vs Non-IFR data')
hist(nifrdf$fz, col=rgb(0,0,1,1),breaks=20, add=TRUE)
legend('topright', c('IFR', 'Non-IFR'),
       fill=c(rgb(1,0,0,1), rgb(0,0,1,1)))

##Wage definition
###Value
valuehistdf <- df
valuehistdf <- valuehistdf %>%
  mutate(value = ifelse(Wages == 1, "Wages",
                        ifelse(Income == 1, "Income",
                               ifelse(Earnings == 1, "Earnings", "Wage Bill"))))
library(ggplot2)
ggplot(valuehistdf, aes(x = fz)) +
  geom_density(aes(color = value))
###Timeframe
timehistdf <- df
timehistdf <- timehistdf %>%
  mutate(time = ifelse(`Hourly value` == 1, "Hourly",
                        ifelse(`Weekly value` == 1, "Weekly",
                               ifelse(`Monthly value` == 1, "Monthly", 
                                      ifelse(`Annual value` == 1, "Yearly", "Unsure of timeframe")))))
ggplot(timehistdf, aes(x = fz)) +
  geom_density(aes(color = time))
##Robot definition
robhistdf <- df
robhistdf <- robhistdf %>%
  mutate(rob = ifelse(`Robots absolute number` == 1, "Absolute value",
                        ifelse(`Robots relative number` == 1, "Relative value", "Dummy variable")))
ggplot(robhistdf, aes(x = fz)) +
  geom_density(aes(color = rob))
```

#Fixed effects
```{r standard}
library(metafor)
library(clubSandwich)
FE  <- rma.mv(yi = fz, V = sefz^2, data = df, method ="FE")
print(summary(FE))
##Clustered
FE_c <- coef_test(FE, cluster=df$IDStudy, vcov = "CR2")
print(FE_c)

##Manual- no clustering, but used later on
df$w_FE <- 1/df$sefz^2
FE_pooled_effect <- sum(df$w_FE*df$fz)/sum(df$w_FE)
FE_pooled_effect
```

```{r with study weights}
df$numberests <- df_t$numberests
df$w_FE2 <- 1/(df$sefz^2*df$numberests)

library(metafor)
library(clubSandwich)
FE2  <- rma.mv(yi = fz, V = sefz^2, W = w_FE2, data = df, method ="FE")
print(summary(FE2))
##Clustered
FE_c2 <- coef_test(FE2, cluster=df$IDStudy, vcov = "CR2")
print(FE_c2)
```

```{r Standard - Correcting for publication bias}
FE_PET  <- rma.mv(yi = fz, V = sefz^2, mods = ~sefz ,data = df, method ="FE")
print(summary(FE_PET))
###Clustering by study
FE_PET_c <- coef_test(FE_PET, cluster=df$IDStudy, vcov = "CR2")
print(FE_PET_c)

FE_PEESE  <- rma.mv(yi = fz, V = sefz^2, mods = ~varfz ,data = df, method ="FE")
print(summary(FE_PEESE))
###Clustering by study
FE_PEESE_c <- coef_test(FE_PEESE, cluster=df$IDStudy, vcov = "CR2")
print(FE_PEESE_c)
```

```{r With study weights - Correcting for publication bias}
FE_PET2 <- rma.mv(yi = fz, V = sefz^2, mods = ~sefz, W = w_FE2 ,data = df, method ="FE")
print(summary(FE_PET2))
###Clustering by study
FE_PET_c2 <- coef_test(FE_PET4, cluster=df$IDStudy, vcov = "CR2")
print(FE_PET_c2)

FE_PEESE4  <- rma.mv(yi = fz, V = sefz^2, mods = ~varfz, W = w_FE2 ,data = df, method ="FE")
print(summary(FE_PEESE4))
###Clustering by study
FE_PEESE_c2 <- coef_test(FE_PEESE4, cluster=df$IDStudy, vcov = "CR2")
print(FE_PEESE_c2)
```

#Random effects
```{r standard}
REML  <- rma.mv(fz ~ 1, V = sefz^2, data = df, random = ~ 1 | obsno)
print(summary(REML))
##Clustering by study
RE_c <- coef_test(REML, cluster=df$IDStudy, vcov = "CR2")
print(RE_c)

#Manually - weights used later on
df$REweight1 <- 1/(REMLv2$sigma2+df$sefz^2)
REWLS <- lm(fz~1, weights=df$REweight1, data=df)
REeffects <- as.numeric(REWLS$coefficients)
REeffects
```

```{r With study weights}
df$REweight2 <- 1/((REML$sigma2+df$sefz^2)*df$numberests)
REML2  <- rma.mv(fz ~ 1, V = sefz^2, data = df,W = REweight2 ,random = ~ 1 | obsno)
print(summary(REML2))
RE_c2 <- coef_test(REML2, cluster=df$IDStudy, vcov = "CR2")
print(RE_c2)
```

```{r Standard - Correcting for publication bias}
RE_PET  <- rma.mv(yi = fz, V = sefz^2,mods = ~sefz, data = df ,random = ~ 1 | obsno)
print(summary(RE_PET))
###Clustering by study
RE_PET_c <- coef_test(RE_PET, cluster=df$IDStudy, vcov = "CR2")
print(RE_PET_c)
RE_PEESE  <- rma.mv(yi = fz, V = sefz^2,mods = ~varfz, data = df ,random = ~ 1 | obsno)
print(summary(RE_PEESE))
###Clustering by study
RE_PEESE_c <- coef_test(RE_PEESE, cluster=df$IDStudy, vcov = "CR2")
print(RE_PEESE_c)
```

```{r With study weights - Correcting for publication bias}
RE_PET_w  <- rma.mv(yi = fz, V = sefz^2,mods = ~sefz ,data = df,W = REweight2 ,random = ~ 1 | obsno)
print(summary(RE_PET_w))
###Clustering by study
RE_PET_c2 <- coef_test(RE_PET_w, cluster=df$IDStudy, vcov = "CR2")
print(RE_PET_c2)
RE_PEESE_w  <- rma.mv(yi = fz, V = sefz^2,mods = ~varfz ,data = df,W = REweight2 ,random = ~ 1 | obsno)
print(summary(RE_PEESE_w))
###Clustering by study
RE_PEESE_c2 <- coef_test(RE_PEESE_w, cluster=df$IDStudy, vcov = "CR2")
print(RE_PEESE_c2)
```

#OLS
```{r Standard OLS}
#No adjustment
OLS <- lm(formula = fz ~ 1, data = df)
OLS_C <- coef_test(OLS, cluster=df$IDStudy, vcov = "CR2")
print(OLS_C) #OLS, clustered

##adjusting for publication bias
OLS_pet <- lm(formula = fz ~ sefz, data = df)
OLS_pet_C <- coef_test(OLS_pet, cluster=df$IDStudy, vcov = "CR2")
print(OLS_pet_C) #OLS, clustered
OLS_peese <- lm(formula = fz ~ varfz, data = df)
OLS_peese_C <- coef_test(OLS_peese, cluster=df$IDStudy, vcov = "CR2")
print(OLS_peese_C) #OLS, clustered
```

```{r Weighted by number of observations per study}
OLS_w1 = 1/df$numberests
##No adjustment
OLS_studyw <- lm(formula = fz ~ 1, weights = OLS_w1, data = df)
OLS_studyw_c <- coef_test(OLS_studyw, cluster=df$IDStudy, vcov = "CR2")
print(OLS_studyw_c) #OLS, clustered

##adjusting for publication bias
OLS_pet_studyw <- lm(formula = fz ~ sefz, weights = OLS_w1, data = df)
OLS_pet_studyw_c <- coef_test(OLS_pet_studyw, cluster=df$IDStudy, vcov = "CR2")
print(OLS_pet_studyw_c) #OLS, clustered
OLS_peese_studyw <- lm(formula = fz ~ varfz, weights = OLS_w1, data = df)
OLS_peese_studyw_c <- coef_test(OLS_peese_studyw, cluster=df$IDStudy, vcov = "CR2")
print(OLS_peese_studyw_c) #OLS, clustered
```
#UWLS
```{r standard}
WLS_w = 1/df$varfz
##No adjustment
UWLS <- lm(formula = fz ~ 1, weights = WLS_w, data = df)
UWLS_c <- coef_test(UWLS, cluster=df$IDStudy, vcov = "CR2")
print(UWLS_c) #OLS, clustered

##adjusting for publication bias
OLS_pet_presw <- lm(formula = fz ~ sefz, weights = WLS_w, data = df)
OLS_pet_presw_c <- coef_test(OLS_pet_presw, cluster=df$IDStudy, vcov = "CR2")
print(OLS_pet_presw_c) #OLS, clustered
OLS_peese_presw <- lm(formula = fz ~ varfz, weights = WLS_w, data = df)
OLS_peese_presw_c <- coef_test(OLS_peese_presw, cluster=df$IDStudy, vcov = "CR2")
print(OLS_peese_presw_c) #OLS, clustered
```
```{r Weighted by number of observations per study}
UWLS_studyw <- lm(formula = fz ~ 1, weights = w_FE2, data = df)
UWLS_studyw_c <- coef_test(UWLS_studyw, cluster=df$IDStudy, vcov = "CR2")
print(UWLS_studyw_c) #OLS, clustered

##adjusting for publication bias
UWLS_pet_studyw <- lm(formula = fz ~ sefz, weights = w_FE2, data = df)
UWLS_pet_studyw_c <- coef_test(UWLS_pet_studyw, cluster=df$IDStudy, vcov = "CR2")
print(UWLS_pet_studyw_c) #OLS, clustered
UWLS_peese_studyw <- lm(formula = fz ~ varfz, weights = w_FE2, data = df)
UWLS_peese_studyw_c <- coef_test(UWLS_peese_studyw, cluster=df$IDStudy, vcov = "CR2")
print(UWLS_peese_studyw_c) #OLS, clustered
```


#Non-linear Techniques
```{r WAAP}
## an inverse-variance weighted average of all the estimates with power at least 80%
#UWLS as in https://www.sciencedirect.com/science/article/pii/S0895435623000471 is:
UWLS  <- lm(fz ~ 1, weights = 1/I(sefz^2), data = df)
summary(UWLS)
WAAP_bound <- abs(UWLS$coefficients[1])/2.8
WAAPdf <- df[df$sefz<WAAP_bound,]
WAAP_reg  <- lm(fz ~ 1, weights = 1/I(sefz^2), data = WAAPdf)
summary(WAAP_reg)

#Cannot cluster as only one study has estimates:
#WAAP_reg_cluster <- coef_test(WAAP_reg, cluster=WAAPdf$IDStudy, vcov = "CR2")
#print(WAAP_reg_cluster)
```

```{r Top10}
#a simple average of the 10% of the most precise estimates
T10_bound <- quantile(df$seprecision, probs = 0.9) #Setting the 90th quantile bound
T10df <- df[df$seprecision>T10_bound,]

T10_reg <- lm(formula = fz ~ 1, data = T10df)
summary(T10_reg)
T10_reg_cluster <- coef_test(T10_reg, cluster=T10df$IDStudy, vcov = "CR2")
print(T10_reg_cluster)
```

```{r Stem based method (Furukawa)}
#extends the previous one by endogenously determining what proportion of the most precise estimates to use.
source("stem_method.R") # from github.com/Chishio318/stem-based_method
est_stem <- stem(df$fz, df$sefz, param)
est_stem$estimates
funnels_stem <- stem_funnel(df$fz, df$sefz, est_stem$estimates)
```





```{r Endogenous kink}
#In Separate stata do file
```

#Endogeneity-robust techniques
```{r MAIVE}
# R code for MAIVE
#
# 1. Input as excel file:
#
#       estimates: bs
#       standard errors: sebs
#       number of observations: Ns
#       (optional: study_id)
#
# 2. Default option for MAIVE: MAIVE-PET-PEESE, unweighted, with instrumented SEs
#
#  Other available options to the user:
#       method= 1 FAT-PET, 2 PEESE, 3 PET-PEESE, 4 EK
#       weighting = 0 no weights, 1 standard weights, 2 adjusted weights  
#       instrumenting = 1 yes, 0 no 
#       correlation at study level: 0 none, 1 fixed effect dummies, 2 clusters
#
# 3. Output:
#
#       MAIVE meta-estimate and standard error
#       Hausman type test: comparison between MAIVE and standard version
#       (When instrumenting: heteroskedastic robust F-test of the first step)
#       Anderson-Rubin confidence interval for weak instruments (only for PET, PEESE or PET-PEESE)
#       the instrumented standard errors are saved as SE_instrumented and can be obtained as "MAIVE$SE_instrumented"

rm(list = ls())

# OPTIONS: 
# method: PET:1, PEESE:2, PET-PEESE:3, EK:4 (default 3)
  method <- 3
# weighting: default no weight: 0 ; weights: 1, adjusted weights: 2 (default 0)
  weight <- 0
# instrumenting (default 1)
  instrument <- 1 
# correlation at study level: none: 0 (default), fixed effects: 1, cluster: 2
  studylevel <-2
  # Anderson-Rubin confidence interval for weak instruments (only for unweighted MAIVE -- PET, PEESE or PET-PEESE): 0 no, 1 yes
  AR <-0
# default options are method=3; weight=0; instrument=1; studylevel=0; AR=0 

# CHOOSE DATASET
# This code, the function maivefunction.R and the excel file inputdata.xlsx must be in the same directory
  if (!require('rstudioapi')) install.packages('rstudioapi'); library('rstudioapi')
  if (!require('readxl')) install.packages('readxl'); library('readxl')
  setwd(dirname(getActiveDocumentContext()$path))
  dat <- read_excel("MAIVE_inputdata.xlsx")
  
  source("maivefunction.R")

# This is the actual execution of the code:
MAIVE=maive(dat=dat,method=method,weight=weight,instrument=instrument,studylevel=studylevel,AR=AR)
cat("\f")

object<-c("MAIVE coefficient","MAIVE standard error","F-test of first step in IV","Hausman-type test (to be used with caution)","Critical Value of Chi2(1)","AR Confidence interval")
value<-c(MAIVE$beta,MAIVE$SE,MAIVE$`F-test`,MAIVE$Hausman,MAIVE$Chi2,paste(MAIVE$AR_CI, collapse = " "))
MAIVEresults<-data.frame(object,value)
cat("\f")
MAIVEresults
```

```{r p-uniform*}
# a nonlinear model based on the statistical principle that p-values should be uniformly distributed at the mean underlying effect size

library(puniform)
# Groundwork
study_levels_h <- levels(as.factor(df$`Study name`))
nreg_h <- length(study_levels_h)

# Vector of medians for Fisher z's
med_fz <- c(rep(NA,nreg_h))
for (i in 1:nreg_h) {
  y <- median(df$fz[df$`Study name` == levels(as.factor(df$`Study name`))[i]])
  med_fz[i] <- y
}

# Vector of medians for Fisher z's SE
med_sefz <- c(rep(NA,nreg_h))
for (i in 1:nreg_h) {
  y <- median(df$sefz[df$`Study name` == levels(as.factor(df$`Study name`))[i]])
  med_sefz[i] <- y
}

#P-UNIFORM*
# Maximum likelihood
p_unistar_est <- puni_star(yi = med_fz, vi = med_sefz^2, side = "right", method = "ML", alpha = 0.05)
print(p_unistar_est)
p_unistar_est$est

#computed se based on 95% CI:
p_unistar_se <- (p_unistar_est$ci.ub - p_unistar_est$ci.lb)/(2*z)
p_unistar_se


#P-UNIFORM (NOT STAR)
# Maximum likelihood
p_uni_est <- puniform(yi = med_fz, vi = med_sefz^2, side = "right", method = "ML",
                      alpha = 0.05)
print(p_uni_est)

#computed se based on 95% CI:
p_uni_se <- (p_uni_est$ci.ub - p_uni_est$ci.lb)/(2*z)
p_uni_se
```

#Other tests
```{r Caliper test}
#The caliper test focuses on an important threshold of the t-statistic (typically 1.96) and compares the number of reported t-statistics just below and just above the threshold.
rng2 <- as.numeric(quantile(df$tstat, probs = c(0.01,0.99), na.rm = TRUE, type=1))
rng2

filter2 <- (df$tstat>rng2[1] & df$tstat<rng2[2]) #Removing the outliers from the graph

# histogram of t-statistic distribution
##Frequency on y-axis
(t_hist <- ggplot(data = df[filter2,], aes(x = tstat[filter2], )) +
  geom_histogram(color = "black", fill = "#1261ff", bins = 80) +
  geom_vline(aes(xintercept = mean(tstat)), color = "dark orange", linetype = "dashed", linewidth = 0.7) + 
  geom_vline(aes(xintercept = -1.96), color = "red", size = 0.5) +
  geom_vline(aes(xintercept = 1.96), color = "red", size = 0.5) +
  labs(x = "T-statistic", y = "Frequency")) +
  scale_x_continuous(breaks = c(-12, -10, -8, -6, -4, -1.96, 0, 1.96, 4, 6, 8), label = c(-12, 10, 8, 6, 4, -1.96, 0, 1.96, 4, 6, 8)) 
##Density on y-axis
hist(df$tstat[filter2], breaks=80, main="Histogram of T-stats", freq = FALSE)
abline(v = mean(df$tstat[filter2]), col = "blue", lwd = 1)
abline(v = -1.96, col = "red", lwd = 1)
abline(v = 1.96, col = "red", lwd = 1)

# 1.96 bound Caliper tests
df$significant_pos <- c(rep(0,nrow(df)))
df$significant_pos[df$tstat> 1.96] <- 1 #laying the groundwork for the regression
#Why do you -1?
Cal_1_nc <- lm(formula = significant_pos ~ tstat - 1, data = df[df$tstat>1.91 & df$tstat<2.01,])
Cal_1 <- coef_test(Cal_1_nc, cluster=df[df$tstat>1.91 & df$tstat<2.01,]$IDStudy, vcov = "CR2")
print(Cal_1)
#Not sure what the n1/n2 is in table 3 and how to get it. Guessed it would be:
n2_1 <- nrow(df[df$tstat>1.91 & df$tstat<1.96,])
n1_1 <- nrow(df[df$tstat>1.96 & df$tstat<2.01,])

Cal_2_nc <- lm(formula = significant_pos ~ tstat - 1, data = df[df$tstat>1.86 & df$tstat<2.06,])
Cal_2 <- coef_test(Cal_2_nc, cluster=df[df$tstat>1.86 & df$tstat<2.06,]$IDStudy, vcov = "CR2")
print(Cal_2)
n2_2 <- nrow(df[df$tstat>1.86 & df$tstat<1.96,])
n1_2 <- nrow(df[df$tstat>1.96 & df$tstat<2.06,])

Cal_3_nc <- lm(formula = significant_pos ~ tstat - 1, data = df[df$tstat>1.76 & df$tstat<2.16,])
Cal_3 <- coef_test(Cal_3_nc, cluster=df[df$tstat>1.76 & df$tstat<2.16,]$IDStudy, vcov = "CR2")
print(Cal_3)
n2_3 <- nrow(df[df$tstat>1.76 & df$tstat<1.96,])
n1_3 <- nrow(df[df$tstat>1.96 & df$tstat<2.16,])

# -1.96 bound Caliper tests
df$significant_neg <- c(rep(0,nrow(df)))
df$significant_neg[df$tstat< -1.96] <- 1 #laying the groundwork for the regression
Cal_1_nc_2 <- lm(formula = significant_neg ~ tstat - 1, data = df[df$tstat>-2.01 & df$tstat< -1.91,])
Cal_1_2 <- coef_test(Cal_1_nc_2, cluster=df[df$tstat>-2.01 & df$tstat< -1.91,]$IDStudy, vcov = "CR2")
print(Cal_1_2)
n2_4 <- nrow(df[df$tstat> -2.01 & df$tstat< -1.96,])
n1_4 <- nrow(df[df$tstat> -1.96 & df$tstat< -1.91,])

Cal_2_nc_2 <- lm(formula = significant_neg ~ tstat - 1, data = df[df$tstat>-2.06 & df$tstat< -1.86,])
Cal_2_2 <- coef_test(Cal_2_nc_2, cluster=df[df$tstat>-2.06 & df$tstat< -1.86,]$IDStudy, vcov = "CR2")
print(Cal_2_2)
n2_5 <- nrow(df[df$tstat> -2.06 & df$tstat< -1.96,])
n1_5 <- nrow(df[df$tstat> -1.96 & df$tstat< -1.86,])

Cal_3_nc_2 <- lm(formula = significant_neg ~ tstat - 1, data = df[df$tstat>-2.16 & df$tstat< -1.76,])
Cal_3_2 <- coef_test(Cal_3_nc_2, cluster=df[df$tstat>-2.16 & df$tstat< -1.76,]$IDStudy, vcov = "CR2")
print(Cal_3_2)
n2_6 <- nrow(df[df$tstat> -2.16 & df$tstat< -1.96,])
n1_6 <- nrow(df[df$tstat> -1.96 & df$tstat< -1.76,])
```



```{r RTMA}
#install.packages("rstan")
#install.packages("phacking")
library(rstan)
library(phacking)
phacking_RTMA <- phacking_meta(yi = df$fz, vi = df$sefz^2, parallelize = FALSE)
phacking_RTMA$stats
```

```{r MAN}
#install.packages("PublicationBias")
library(PublicationBias)
MAN_reg <- pubbias_meta(df$fz,
                               df$sefz^2,
                               cluster = df$IDStudy,
                               selection_ratio = 4,
                               return_worst_meta = TRUE)

MAN_reg$stats
```


#Funnel plots
```{r Method 1 - RE}
library(meta)
funnel.meta(m.gen.RE,
            studlab = F)
title("Funnel Plot (Random Effects)")
```


```{r Method 2 - using metafor package}
funnel(x=df$fz, sei = df$sefz, yaxis="sei", las=1, refline = REeffects)
###Or 
funnel(m.gen.RE)
```


#Forest Plots
```{r Box plot}
#Box plot
library(tidyverse)
(study_fz <- ggplot(data = df, aes(x = fz, y=reorder(factor(`Study name`, levels = rev(levels(factor(`Study name`)))),-`Publication year`))) +
    geom_boxplot(outlier.colour = "#005CAB", outlier.shape = 21, outlier.fill = "#005CAB", fill="#e6f3ff", color = "#0d4ed1") +
    geom_vline(aes(xintercept = fzsummary['Mean']), color = "red", size = 0.5) +
    labs(title = NULL,x="Estimate of the Fisher's Z between robots and wages", y = NULL))

```
```{r box plot no acemoglu}
#Dataset:
df_boxplot2 <- df[df$IDStudy != 1, ]
df_boxplot2$obsno = 1:nrow(df_boxplot2)
#Box plot
library(tidyverse)
(study_fz2 <- ggplot(data = df_boxplot2, aes(x = fz, y=reorder(factor(`Study name`, levels = rev(levels(factor(`Study name`)))),-`Publication year`))) +
    geom_boxplot(outlier.colour = "#005CAB", outlier.shape = 21, outlier.fill = "#005CAB", fill="#e6f3ff", color = "#0d4ed1") +
    geom_vline(aes(xintercept = fzsummary['Mean']), color = "red", size = 0.5) +
    labs(title = NULL,x="Estimate of the Fisher's Z between robots and wages", y = NULL))
```


```{r study weights for FE forest}
library(magrittr)
library(dplyr)
#got from mitchells code
df$fz_FE <- df$fz*df$w_FE
df_combined_FE <- as.data.frame(df %>%
  group_by(IDStudy) %>%
  summarise(
    fz_FE = sum(fz_FE, na.rm = T),
    w_FE = sum(w_FE, na.rm = T),
    max = max(fz, na.rm = T),
    min = min(fz, na.rm = T),
    studyname = min(`Study name`, na.rm = T) #min worked for some reason so used that
  ))

df_combined_FE$avg_fz_FE <- df_combined_FE$fz_FE/df_combined_FE$w_FE
df_combined_FE$avg_sefz_FE <- sqrt(1/df_combined_FE$w_FE)
df_combined_FE$WT <- 100*df_combined_FE$w_FE/sum(df_combined_FE$w_FE)
df_combined_FE$lower <- df_combined_FE$avg_fz_FE - 1.96*df_combined_FE$avg_sefz_FE
df_combined_FE$upper <- df_combined_FE$avg_fz_FE + 1.96*df_combined_FE$avg_sefz_FE
View(cbind.data.frame(StudyID = df_combined_FE$IDStudy,
                      FEeffect=df_combined_FE$avg_fz_FE, 
                      LowerBound=df_combined_FE$lower,
                      UpperBound=df_combined_FE$upper,
                      WT=df_combined_FE$WT))
summary(as.numeric(df_combined_FE$WT))
df_combined_FE <- df_combined_FE[order(-df_combined_FE$WT),]
head(df_combined_FE)

# Create a dataset with the top 5 highest values of FE weights
top_5_wfe <- df_combined_FE %>%
  arrange(desc(WT)) %>% # Arrange in descending order
  slice(1:5) %>% # Select the top 5 rows
  select(studyname,WT)
# Print the top 5 dataset
print(top_5_wfe)
```

```{r FE forest}
library(metaviz)
viz_thickforest(x = df[, c("fz", "sefz")], 
                       group = df[, "IDStudy"],
                       type="summary_only",
                       method = "FE",
                       study_labels = df[,"Study name"],
                       annotate_CI=TRUE)
### making forest plot using 'metafor' package
forest(df_combined_FE$avg_fz_FE, df_combined_FE$avg_sefz_FE^2, slab=df_combined_FE$studyname, psize=1)
```

```{r study weights for RE forest}
df$fz_RE <- df$fz*df$REweight1
df_combined_RE <- as.data.frame(df %>%
                          group_by(IDStudy) %>%
                          summarise(
                            fz_RE = sum(fz_RE, na.rm = T),
                            REweight1 = sum(REweight1, na.rm = T),
                            max = max(fz, na.rm = T),
                            min = min(fz, na.rm = T),
                            studyname = min(`Study name`, na.rm = T) #min worked for some reason so used that
                          ))
df_combined_RE$avg_fz_RE <- df_combined_RE$fz_RE/df_combined_RE$REweight1
df_combined_RE$avg_sefz_RE <- sqrt(1/df_combined_RE$REweight1)
df_combined_RE$WT <- 100*df_combined_RE$REweight1/sum(df_combined_RE$REweight1)
df_combined_RE$lower <- df_combined_RE$avg_fz_RE - 1.96*df_combined_RE$avg_sefz_RE
df_combined_RE$upper <- df_combined_RE$avg_fz_RE + 1.96*df_combined_RE$avg_sefz_RE
View(cbind.data.frame(StudyID = df_combined_RE$IDStudy,
                      REeffect=df_combined_RE$avg_fz_RE, 
                      LowerBound=df_combined_RE$lower,
                      UpperBound=df_combined_RE$upper,
                      WT=df_combined_RE$WT))
summary(as.numeric(df_combined_RE$WT))
df_combined_RE <- df_combined_RE[order(-df_combined_RE$WT),]
head(df_combined_RE)
#This time paper 1 has a lot of the weight:
# Create a dataset with the top 5 highest values of RE weights
top_5_wre <- df_combined_RE %>%
  arrange(desc(WT)) %>% # Arrange in descending order
  slice(1:5) %>% # Select the top 5 rows
  select(studyname,WT)
# Print the top 5 dataset
print(top_5_wre)
```

```{r RE forest}
viz_thickforest(x = df[, c("fz", "sefz")], 
                       group = df[, "IDStudy"],
                       type="summary_only",
                       method = "REML",
                       annotate_CI=TRUE)
### making forest plot using 'metafor' package
forest(df_combined_RE$avg_fz_RE, df_combined_RE$avg_sefz_RE^2, slab=df_combined_RE$studyname, psize=1)
```

#Bayesian Model Averaging (BMA)
```{r Summary statistics of regression variables}
BMA_data_all <- read_xlsx("bmadata.xlsx", sheet = 'Sheet2') #publication year is pub year - average pub year
#Removed: lagged dep var, routine/non-routine jobs, wage bill, robot dummy, average wage (last three due to collinearity with other variables) 
# Calculate a single stat
calcStat <- function(data, ftype){
  temp_list <- lapply(data, ftype)
  temp_num <- unlist(temp_list)
  return(data.frame(unname(temp_num)))
}
# Calculate all stats and return them as a data frame
calcStats <- function(data){
  varnames <- colnames(data)
  temp <- list()
  desc_stats <- c('mean', 'min', 'max', 'sd')
  for(i in desc_stats){
    stat <- calcStat(data, i)
    temp <- append(temp, stat)
  }
  df <- data.frame(temp)
  colnames(df) <- c('Mean', 'Min', 'Max', 'SD')
  rownames(df) <- varnames
  return(df)
}
BMA_desc <- calcStats(BMA_data_all)
options(scipen = 999)
BMA_desc
```

```{r BMA set up}
BMA_data <- read_xlsx("bmadata.xlsx", sheet = 'Sheet1') #publication year is pub year - average pub year
#Removed: lagged dep var, routine/non-routine jobs, wage bill, robot dummy, average wage (last three due to collinearity with other variables) 
BMA_num <- data.frame(BMA_data, stringsAsFactors = TRUE) #Converting to a data frame
names(BMA_num) <- c("Fisher's Z", "Standard error", "Clustered SE", "Not IFR data", "Published",
                    "Publication year", "Wage Logged", "Wage Hourly", "Wage Weekly", 
                    "Wage Monthly","Wage Annual", "Wages", "Income",
                    "Real", "Earnings", "Wage differences", "Robots logged", "Robots Absolute", "Robots relative",
                    "Robots differences", "Robots Contemporaneous", "Foreign Robots as variable", 
                    "Foreign Robots as control", "Individual level data", 
                    "Firm level data","Industry level data", "Regional level data", "Country/US state level data",
                    "Manufacturing", "Services", "Agriculture", "Young", "Middle-aged/old", "Male", "Female",
                    "Low/medium skilled/educated", "Highly skilled/educated", "Developed countries",
                    "Developing countries", "Time fixed effects", "People fixed effects", "Firm fixed effects",
                    "Industry fixed effects", "Regional fixed effects", "Country/US state fixed effects",
                    "Control: Population", "Control: Gender", "Control: Age", "Control: Education",
                    "Control: Ethnicity",  "Control: Occupation", "Control: Skill",  
                    "Control: Exposure to capital", "Control: Exposure to imports", "Sample weighting",
                    "IV/2SLS", "Addresses outliers")
head(BMA_num)
```

```{r BMA}

library(BMS)

BMA= bms(BMA_num, burn=1e5,iter=3e5, g="UIP", mprior="dilut", nmodel=50000, mcmc="bd", user.int=FALSE) 
print(BMA)
print(BMA$topmod[1])
```

```{r r-squared for top variables}
#Above 0.5
BMA_R2_1 <- lm(formula = `Fisher's Z` ~ `Standard error`+`Not IFR data`+`Wages`+`Income`+`Real`+`Earnings`+`Industry level data`+`Country/US state level data`+`Manufacturing`+`Developing countries`+`Control: Skill`+`Control: Exposure to imports`+`Robots differences`+`Sample weighting`+`Developed countries`+`Country/US state fixed effects`+`Individual level data`+`Foreign Robots as variable`+`People fixed effects`+`Robots logged`+`Published`+`Time fixed effects`+`Wage Logged`, data = BMA_num)
print(summary(BMA_R2_1))
#PIP of 1
BMA_R2_2 <- lm(formula = `Fisher's Z` ~ `Standard error`+`Not IFR data`+`Wages`+`Income`+`Real`+`Earnings`+`Industry level data`+`Country/US state level data`+`Manufacturing`+`Developing countries`+`Control: Skill`+`Control: Exposure to imports`, data = BMA_num)
print(summary(BMA_R2_2))
#Above 0.9
BMA_R2_3 <- lm(formula = `Fisher's Z` ~ `Standard error`+`Not IFR data`+`Wages`+`Income`+`Real`+`Earnings`+`Industry level data`+`Country/US state level data`+`Manufacturing`+`Developing countries`+`Control: Skill`+`Control: Exposure to imports`+`Robots differences`+`Sample weighting`+`Developed countries`+`Country/US state fixed effects`+`Individual level data`+`Foreign Robots as variable`+`People fixed effects`+`Robots logged`+`Published`, data = BMA_num)
print(summary(BMA_R2_3))
```

#Estimate tables
```{r Linear results}
linear_results <- data.frame(Value = c("Not correcting for Publication bias","Unweighted:", "Estimate (Constant)", "Standard error", "Observations", "Weighted:", "By inverse of number of study estimates", "Estimate (Constant)", "Standard error", "Observations", "By precision", "Estimate (Constant)", "Standard error", "Observations", "Correcting for Publication bias (PET)","Unweighted:", "Effect beyond bias (Constant)", "Standard error", "Publication bias (Standard error)", "Standard error","Observations", "Weighted:", "By inverse of number of study estimates", "Effect beyond bias (Constant)", "Standard error", "Publication bias (Standard error)", "Standard error","Observations", "By precision", "Effect beyond bias (Constant)", "Standard error", "Publication bias (Standard error)", "Standard error","Observations"), FE = c("","",FE_c$beta,FE_c$SE,nrow(df),"","",FE_c3$beta,FE_c3$SE,nrow(df),"","","","","","",FE_PET_c$beta[1],FE_PET_c$SE[1],FE_PET_c$beta[2],FE_PET_c$SE[2],nrow(df),"","",FE_PET_c2$beta[1],FE_PET_c2$SE[1],FE_PET_c2$beta[2],FE_PET_c2$SE[2],nrow(df),"","","","","",""), RE = c("","",RE_c$beta,RE_c$SE,nrow(df),"","",RE_c2$beta,RE_c2$SE,nrow(df),"","","","","","",RE_PET_c$beta[1],RE_PET_c$SE[1],RE_PET_c$beta[2],RE_PET_c$SE[2],nrow(df),"","",RE_PET_c2$beta[1],RE_PET_c2$SE[1],RE_PET_c2$beta[2],RE_PET_c2$SE[2],nrow(df),"","","","","",""), OLS = c("","",OLS_C$beta,OLS_C$SE,nrow(df),"","",OLS_studyw_c$beta,OLS_studyw_c$SE,nrow(df),"",OLS_presw_c$beta,OLS_presw_c$SE,nrow(df),"","",OLS_pet_C$beta[1],OLS_pet_C$SE[1],OLS_pet_C$beta[2],OLS_pet_C$SE[2],nrow(df),"","",OLS_pet_studyw_c$beta[1],OLS_pet_studyw_c$SE[1],OLS_pet_studyw_c$beta[2],OLS_pet_studyw_c$SE[2],nrow(df),"",OLS_pet_presw_c$beta[1],OLS_pet_presw_c$SE[1],OLS_pet_presw_c$beta[2],OLS_pet_presw_c$SE[2],nrow(df)))
print (linear_results)
```

```{r Non-linear results}
nonlinear_results <- data.frame(Value = c("Effect beyond bias", "Standard error", "Publication bias", "Standard error","Observations"), WAAP =c(WAAP_reg_cluster$beta,WAAP_reg_cluster$SE, "","",nrow(df)), Top10 =c(T10_reg_cluster$beta,T10_reg_cluster$SE, "","",nrow(df)), Stem =c(est_stem$estimates[1],est_stem$estimates[2], "","",nrow(df)), Kink = c(-.0000437,.0000377,-.5172523,.0315685,nrow(df)))
print (nonlinear_results)
```
```{r Endogeneity-robust techniques}
endogrob_results <- data.frame(Value=c("Effect beyond bias", "Standard error",  "Publication bias (L for p-uniform*)", "Standard error (p-val for p-uniform*)","two-step weak-instrument-robust 95% confidence interval","First-stage robust F-stat" ,"Observations"), IV =c(0.0205428,0.0053854,-1.107136,0.1746162,"{-1.449377, -.7648944}",1174.56,nrow(df)),  puniform=c(p_uni_est$est,p_uni_se,p_uni_est$L.pb,p_uni_est$pval.pb,"","",nrow(df)),puniformstar=c(p_unistar_est$est,p_unistar_se,"","","","",nrow(df)))
print (endogrob_results)
```

```{r caliper results}
caliper_results <- data.frame(Value=c("Caliper width 0.05", "Standard error", "n1/n2", "Caliper width 0.1", "Standard error",  "n1/n2", "Caliper width 0.2", "Standard error",  "n1/n2"), `Threshold 1.96` =c(Cal_1$beta,Cal_1$SE,sprintf("%s/%s", n1_1, n2_1),Cal_2$beta,Cal_2$SE,sprintf("%s/%s", n1_2, n2_2),Cal_3$beta,Cal_3$SE,sprintf("%s/%s", n1_3, n2_3)), `Threshold -1.96`=c(Cal_1_2$beta,Cal_1_2$SE,sprintf("%s/%s", n1_4, n2_4),Cal_2_2$beta,Cal_2_2$SE,sprintf("%s/%s", n1_5, n2_5),Cal_3_2$beta,Cal_3_2$SE,sprintf("%s/%s", n1_6, n2_6)))
print (caliper_results)
```
```{r Linear results with max weights removed - still need to change values}
linear_results_nomax <- data.frame(Value = c("Not correcting for Publication bias","Unweighted:", "Estimate (Constant)", "Standard error", "Observations", "Weighted:", "By inverse of number of study estimates", "Estimate (Constant)", "Standard error", "Observations", "Correcting for Publication bias (PET)","Unweighted:", "Effect beyond bias (Constant)", "Standard error", "Publication bias (Standard error)", "Standard error","Observations", "Weighted:", "By inverse of number of study estimates", "Effect beyond bias (Constant)", "Standard error", "Publication bias (Standard error)", "Standard error","Observations"), FE = c("","",FE_c_nm$beta,FE_c_nm$SE,nrow(df_nomax_fe),"","",FE_c2_nm$beta,FE_c2_nm$SE,nrow(df_nomax_fe),"","",FE_PET_c_nm$beta[1],FE_PET_c_nm$SE[1],FE_PET_c_nm$beta[2],FE_PET_c_nm$SE[2],nrow(df_nomax_fe),"","",FE_PET_c_nm2$beta[1],FE_PET_c_nm2$SE[1],FE_PET_c_nm2$beta[2],FE_PET_c_nm2$SE[2],nrow(df_nomax_fe)), RE = c("","",RE_c_nm$beta,RE_c_nm$SE,nrow(df_nomax_re),"","",RE_c2_nm$beta,RE_c2_nm$SE,nrow(df_nomax_re),"","",RE_PET_c_nm$beta[1],RE_PET_c_nm$SE[1],RE_PET_c_nm$beta[2],RE_PET_c_nm$SE[2],nrow(df_nomax_re),"","",RE_PET_c_nm2$beta[1],RE_PET_c_nm2$SE[1],RE_PET_c_nm2$beta[2],RE_PET_c_nm2$SE[2],nrow(df_nomax_re)))
print (linear_results_nomax)
```

